package org.yinxue.spider.core.downloader;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.yinxue.spider.core.transport.HttpClientBuilder;
import org.yinxue.spider.core.transport.HttpClientPool;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloader that fetches the content of a web page with an HTTP GET request
 * and decodes it to a {@link String}.
 *
 * <p>Failures are never propagated to the caller: any error, a non-200 status
 * or a response without a body yields an empty string and is logged.
 *
 * @author zengjian
 * @create 2018-07-12 15:51
 * @since 1.0.0
 */
public class Downloader {

    public static final String UTF_8 = "utf-8";
    /** Kept for API compatibility; no longer used internally for probing. */
    public static final String GBK = "gbk";

    private static final Logger LOGGER = LoggerFactory.getLogger(Downloader.class);

    /**
     * Regular expression that extracts the charset declared in an HTML
     * {@code <meta>} tag. It is case-insensitive and accepts double-quoted,
     * single-quoted and unquoted values, covering both
     * {@code <meta charset="...">} and
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=...">}.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "<meta\\s[^>]*?charset\\s*=\\s*[\"']?\\s*([a-zA-Z0-9][a-zA-Z0-9_.:+-]*)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Pool the HTTP clients are borrowed from.
     */
    private final HttpClientPool httpClientPool = new HttpClientPool(new HttpClientBuilder());

    /**
     * Maximum time a socket read may block, in milliseconds.
     */
    private static final int SOCKET_TIME = 30000;

    /**
     * Maximum time allowed for establishing the TCP connection (three-way
     * handshake), in milliseconds.
     */
    private static final int CONNECT_TIME = 3000;

    /**
     * Downloads the page at {@code url}, optionally through a proxy.
     *
     * @param url       address of the page to fetch
     * @param proxyHost proxy host name, or {@code null} for a direct connection
     * @param proxyPort proxy port, or {@code null} for a direct connection
     * @return the decoded page content, or an empty string when the status is
     *         not 200, the response has no body, or any error occurs
     */
    public String downloadHtml(final String url, final String proxyHost, final Integer proxyPort) {
        CloseableHttpClient client = null;
        CloseableHttpResponse response = null;
        try {
            client = httpClientPool.borrowObject();
            response = client.execute(buildRequest(url, proxyHost, proxyPort));
            if (!isOK(response)) {
                LOGGER.warn("Unexpected response for {}: {}", url,
                        response == null ? null : response.getStatusLine());
                return "";
            }
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                // A 200 response is allowed to carry no entity at all.
                LOGGER.warn("Response for {} has no body", url);
                return "";
            }
            try (InputStream inputStream = httpEntity.getContent()) {
                return decode(IOUtils.toByteArray(inputStream));
            }
        } catch (Exception e) {
            LOGGER.error("Failed to download {}", url, e);
            return "";
        } finally {
            closeResponse(response, url);
            // borrowObject() may have failed, in which case there is nothing to return.
            if (client != null) {
                httpClientPool.returnObject(client);
            }
        }
    }

    /**
     * Tells whether the response carries HTTP status 200.
     */
    private boolean isOK(CloseableHttpResponse response) {
        return response != null && response.getStatusLine() != null
                && response.getStatusLine().getStatusCode() == 200;
    }

    /**
     * Downloads the page at {@code url} over a direct (non-proxied) connection.
     *
     * @param url address of the page to fetch
     * @return the decoded page content, or an empty string on any failure
     */
    public String downloadHtml(final String url) {
        return downloadHtml(url, null, null);
    }

    /**
     * Closes the response, logging (not propagating) any I/O failure so that
     * an already-read page is still returned to the caller.
     */
    private void closeResponse(CloseableHttpResponse response, String url) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            LOGGER.warn("Failed to close response for {}", url, e);
        }
    }

    /**
     * Decodes the raw page bytes using the charset declared in the page's
     * {@code <meta>} tag, defaulting to UTF-8.
     */
    private String decode(byte[] bytes) {
        // ISO-8859-1 maps every byte to exactly one char, so the ASCII markup
        // holding the charset declaration survives this probing pass intact.
        String probe = new String(bytes, StandardCharsets.ISO_8859_1);
        Charset charset = toCharset(parseCharset(probe, UTF_8));
        return new String(bytes, charset);
    }

    /**
     * Resolves a charset name, falling back to UTF-8 when the name is illegal
     * or not supported by this JVM.
     */
    private Charset toCharset(String name) {
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
            LOGGER.warn("Unsupported charset '{}' declared by page, falling back to {}", name, UTF_8);
            return StandardCharsets.UTF_8;
        }
    }

    /**
     * Builds the GET request, attaching a proxy only when both host and port are given.
     */
    private HttpUriRequest buildRequest(String url, String proxyHost, Integer proxyPort) {
        HttpHost httpHost = null;
        if (proxyHost != null && proxyPort != null) {
            httpHost = new HttpHost(proxyHost, proxyPort);
        }
        return buildProxyHttpGet(url, httpHost);
    }

    /**
     * Builds a GET request configured with the socket/connect timeouts and the
     * given proxy ({@code null} means no proxy).
     */
    private HttpUriRequest buildProxyHttpGet(String url, HttpHost proxy) {
        HttpGet request = new HttpGet(url);
        request.setConfig(RequestConfig.custom()
                .setSocketTimeout(SOCKET_TIME)
                .setConnectTimeout(CONNECT_TIME)
                .setProxy(proxy)
                .build());
        return request;
    }

    /**
     * Returns the first charset name declared in a {@code <meta>} tag of
     * {@code html}, or {@code defaultCode} when none is found.
     */
    private String parseCharset(String html, String defaultCode) {
        Matcher matcher = CHARSET_PATTERN.matcher(html);
        return matcher.find() ? matcher.group(1) : defaultCode;
    }
}
